### clean county level population, geography, and demography vars for 2005
# turn into descriptive timing regressions 
## last modified by Charlotte Ambrozek
## last modified date 27 December 2024s
library(tidyverse)
library(haven)
library(readr)
library(readxl)
library(broom)
library(AER)
library(stargazer)

#documentation about variable definitions available in co-est00int-agesex-5yr.pdf (Box/ERS_WIC_Coop/County variables)
ages <- read_csv("./data/raw/co-est00int-agesex-5yr.csv") %>%
  rename_all(tolower) %>% 
  filter(sumlev == "050") %>% #ensure all obs are county level
  select(!c(sumlev)) %>% #don't need sumlevel anymore
  filter(sex != 1) %>% # remove men only from the sample
  #maybe keep a measure of fertility age women
  filter((sex == 0 & agegrp == 1) | (sex == 0 & agegrp == 0) | (sex == 2 & (agegrp == 4 | agegrp == 5 | agegrp == 6 | agegrp == 7 | agegrp == 8 | agegrp == 9 | agegrp == 10))) %>%
  mutate(ct_fips = paste0(state, county)) %>%
  select(!c(state, county, stname, ctyname, estimatesbase2000, census2010pop))

ages <- pivot_longer(ages, cols = starts_with("popestimate"), names_to = "year", values_to = "pop") %>%
  mutate(year = as.numeric(substr(year, 12, 15))) %>%
  mutate(age_group = ifelse(agegrp == 0, "total",
                            ifelse(agegrp == 1, "under5", "women15to49"))) %>%
  select(!c(agegrp)) %>%
  group_by(ct_fips, year, age_group) %>%
  summarise(pop = sum(pop), .groups = "keep") %>%
  pivot_wider(names_from = age_group, values_from = pop, names_glue = "{age_group}_pop")

race <- read_csv("./data/raw/co-est00int-sexracehisp.csv") %>%
  rename_all(tolower) %>% 
  filter(sumlev == "050" & sex == 0) %>% #ensure all obs are county level and only using counts not divided by sex
  select(!c(sumlev, sex)) %>% #don't need sumlevel anymore
  mutate(ct_fips = paste0(state, county)) %>%
  select(!c(state, county, stname, ctyname, estimatesbase2000, census2010pop))

race <- pivot_longer(race, cols = starts_with("popestimate"), names_to = "year", values_to = "pop") %>%
  mutate(year = as.numeric(substr(year, 12, 15))) %>%
  mutate(hisp = ifelse(origin == 0, "total",
                            ifelse(origin == 1, "nothispanic", "hispanic")),
         race = ifelse(race == 0, "total",
                         ifelse(race == 1, "white",
                                ifelse(race == 2, "black", "other")))) %>%
  select(!c(origin)) %>%
  group_by(ct_fips, year, hisp, race) %>%
  summarise(pop = sum(pop), .groups = "keep") %>%
  pivot_wider(names_from = c(hisp, race), values_from = pop, names_glue = "{hisp}_{race}_pop") %>%
  select(!c(total_total_pop))

demos <- left_join(ages, race, by = c("ct_fips", "year"))

rm(race, ages)

regions <- read_csv("./data/raw/co-est00int-tot.csv") %>%
  rename_all(tolower) %>% 
  filter(sumlev == 50) %>%
  mutate(ct_fips =  as.character(state*1000 + county)) %>%
  mutate(ct_fips = ifelse(str_length(ct_fips) == 4, paste0("0",ct_fips), ct_fips)) %>%
  select(c(ct_fips, region, division)) %>%
  mutate(region = ifelse(region == 1, "northeast",
                         ifelse(region == 2, "midwest",
                                ifelse(region == 3, "south",
                                       ifelse(region == 4, "west", NA))))) %>% # look at documentation for division, there's too many to code
  mutate(region = as.factor(region), division = as.factor(division))

demos <- left_join(demos, regions, by = "ct_fips")

rm(regions)

gc()

urbanrural <- read_excel("./data/raw/NCHSURCodes2013.xlsx", .name_repair = "universal") %>%
  rename_all(tolower) %>% 
  mutate(ct_fips =  as.character(fips.code)) %>%
  mutate(ct_fips = ifelse(str_length(ct_fips) == 4, paste0("0",ct_fips), ct_fips)) %>%
  select(c(ct_fips, ..2006.code)) %>%
  mutate(urbanrural = factor(..2006.code, labels = c("lgcentralmetro", "lgfringemetro", "medmetro", "smmetro", "micropolitan", "noncore"))) %>%
  select(!c(..2006.code))

demos <- left_join(demos, urbanrural, by = "ct_fips")

rm(urbanrural)

gc()

demos <- mutate(demos, share_black = total_black_pop/total_pop,
                share_under5 = under5_pop/total_pop,
                share_hispanic = hispanic_total_pop/total_pop,
                share_nothispanic_white = nothispanic_white_pop/total_pop)

counties <- read_dta("./data/cleaned/countystats.dta") # from statestats.r ins cleaning code folder

income <- filter(counties, year == 2005) %>%
  ungroup() %>%
  select(!c(year)) %>%
  mutate(ct_fips = as.numeric(ct_fips))

rollout <- read_dta("./data/cleaned/ewic_rollout.dta") %>%
  select(c(ct_fips, ev_year)) %>%
  distinct()

demos <- filter(demos, year == 2005) %>%
  ungroup() %>%
  select(!c(year)) %>%
  mutate(ct_fips = as.numeric(ct_fips))

rollout <- left_join(rollout, demos, by = "ct_fips") %>%
  mutate(logpop = log(total_pop),
         state = as.factor(floor(ct_fips/1000)))

rollout <- left_join(rollout, income, by = "ct_fips") %>%
  rename(shareunder5 = share_under5,
        shareblack = share_black,
        sharehispanic = share_hispanic)

vars.order <- c("logpop", "shareunder5", "shareltwic", "shareunder5:shareltwic", "shareblack", "sharehispanic", "sharesnap", "sharecash", "medincome")

timing_reg <- lm(ev_year ~ logpop +
                          shareunder5 +
                          shareltwic +
                          shareunder5*shareltwic +
                          shareblack +
                          sharehispanic +
                          sharesnap +
                          sharecash +
                          medincome +
                          region +
                          urbanrural,
                        data = rollout)
robust_timing_reg <- coeftest(timing_reg, vcov = vcovCL, type = "HC1", cluster = ~ 
                        state, save = TRUE)

timingwinstate_reg <- lm(ev_year ~ logpop +
                           shareunder5 +
                           shareltwic +
                           shareunder5*shareltwic +
                           shareblack +
                           sharehispanic +
                           sharesnap +
                           sharecash +
                           medincome +
                           urbanrural +
                           state,
                         data = rollout)
robust_timingwinstate_reg <- coeftest(timingwinstate_reg, vcov = vcovCL, type = "HC1", cluster = ~ 
                                state, save = TRUE)

timingnoinc_reg <- lm(ev_year ~ logpop +
                   shareunder5 +
                   shareblack +
                   sharehispanic +
                   region +
                   urbanrural,
                 data = rollout)
robust_timingnoinc_reg <- coeftest(timingnoinc_reg, vcov = vcovCL, type = "HC1", cluster = ~ 
                                state, save = TRUE)

timingnoincwinstate_reg <- lm(ev_year ~ logpop +
                           shareunder5 +
                           shareblack +
                           sharehispanic +
                           urbanrural +
                           state,
                         data = rollout)
robust_timingnoincwinstate_reg <- coeftest(timingnoincwinstate_reg, vcov = vcovCL, type = "HC1", cluster = ~ 
                                        state, save = TRUE)

stargazer(timing_reg, timingwinstate_reg, timingnoinc_reg, timingnoincwinstate_reg,
          type = "latex",
          out = "./analysis/output/tables/timingregs.tex",
          order = paste0("^", vars.order, "$"),
          model.names = FALSE,
          column.labels = c("Population $>$ 65,000", "All counties"),
          column.separate = c(2, 2),
          dep.var.labels = "WIC EBT year",
          covariate.labels = c("Log population",
                               "Share $< 5$",
                               "Share $< 185$\\% FPL",
                               "(Share $< 5$) $\\times$ (Share $< 185$\\% FPL)",
                               "Share Black",
                               "Share Hispanic",
                               "Share SNAP",
                               "Share cash aid",
                               "Median income",
                               "Northeast",
                               "South",
                               "West"),
          add.lines = list(c("State FE", "No", "Yes", "No", "Yes")),
          omit = c("urbanrural", "state"),
          omit.stat = c("ser", "f", "rsq"),
          se = list(robust_timing_reg[, "Std. Error"],
                    robust_timingwinstate_reg[, "Std. Error"],
                    robust_timingnoinc_reg[, "Std. Error"],
                    robust_timingnoincwinstate_reg[, "Std. Error"]),
          notes = c("Includes indicators for urban-rural divides: large metro, large fringe metro,",
                    "medium metro, small metro, micropolitan, and non-core.",
                    "All of these are insignificant at 5\\%.",
                    "Uses heteroskedasticity robust s.e.s clustered at the state level."),
          notes.align = "l",
          notes.label = "",
          table.placement = "H")